library(SingleCellExperiment)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(scran)
## Loading required package: scuttle
library(scater)
## Loading required package: ggplot2
sce <- readRDS("../data/SC2018/231115_sceAfterCellTypeAnnotation.rds")

Cell states

B-cells

Authors recover three subclusters. Naive B cells (IGHD+, CD27-), quiescent memory B cells (CD27+, IGHG1+, IGHA1+) and plasma cells (IGHA+, IGHG+, CD38+, MS4A1-)

sceB <- sce[,sce$cellType == "B-cell"]
sceB <- logNormCounts(sceB)
dec <- modelGeneVar(sceB)
hvgB <- getTopHVGs(dec, prop=0.05)
set.seed(1234)
sceB <- runPCA(sceB, 
              ncomponents=10, 
              subset_row=hvgB)
sceB <- runUMAP(sceB, 
               dimred = 'PCA', 
               external_neighbors = TRUE,
               ncomponents = 10,
               min_dist=0.4)
plotUMAP(sceB)

plotUMAP(sceB, colour_by="IGHD")

plotUMAP(sceB, colour_by="CD27")

plotUMAP(sceB, colour_by="IGHG1")

plotUMAP(sceB, colour_by="IGHA1")

plotUMAP(sceB, colour_by="CD38")

plotUMAP(sceB, colour_by="MS4A1")

set.seed(464688)
# Build a shared nearest-neighbor graph from PCA space
graph <- buildSNNGraph(sceB, 
                       use.dimred = 'PCA')
# Leiden clustering on the SNN graph
cluster_leidenB <- factor(igraph::cluster_leiden(graph = graph,
                                                 resolution_parameter = 0.05)$membership) 
nlevels(cluster_leidenB)
## [1] 6
colData(sceB)$cluster_leidenB <- cluster_leidenB
plotUMAP(sceB, colour_by="cluster_leidenB")

plotReducedDim(sceB, 
               dimred = "UMAP_fastMNNCorrected",
               point_size=1/2, 
               colour_by="IGHD") +
  xlim(c(2,4)) +
  ylim(c(7.5,12))
## Warning: Removed 57 rows containing missing values (`geom_point()`).

plotReducedDim(sceB, 
               dimred = "UMAP_fastMNNCorrected",
               point_size=1/2, 
               colour_by="CD27") +
  xlim(c(2,4)) +
  ylim(c(7.5,12))
## Warning: Removed 57 rows containing missing values (`geom_point()`).

plotReducedDim(sceB, 
               dimred = "UMAP_fastMNNCorrected",
               point_size=1/2, 
               colour_by="IGHG1") +
  xlim(c(2,4)) +
  ylim(c(7.5,12))
## Warning: Removed 57 rows containing missing values (`geom_point()`).

plotReducedDim(sceB, 
               dimred = "UMAP_fastMNNCorrected",
               point_size=1/2, 
               colour_by="IGHA1") +
  xlim(c(2,4)) +
  ylim(c(7.5,12))
## Warning: Removed 57 rows containing missing values (`geom_point()`).

plotReducedDim(sceB, 
               dimred = "UMAP_fastMNNCorrected",
               point_size=1/2, 
               colour_by="CD38") +
  xlim(c(2,4)) +
  ylim(c(7.5,12))
## Warning: Removed 57 rows containing missing values (`geom_point()`).

plotReducedDim(sceB, 
               dimred = "UMAP_fastMNNCorrected",
               point_size=1/2, 
               colour_by="MS4A1") +
  xlim(c(2,4)) +
  ylim(c(7.5,12))
## Warning: Removed 57 rows containing missing values (`geom_point()`).

T-cells

sceT <- sce[,sce$cellType == "T-cell"]
sceT <- logNormCounts(sceT)
dec <- modelGeneVar(sceT)
hvgT <- getTopHVGs(dec, prop=0.05)
set.seed(1234)
sceT <- runPCA(sceT, 
              ncomponents=10, 
              subset_row=hvgB)
sceT <- runUMAP(sceT, 
               dimred = 'PCA', 
               external_neighbors = TRUE,
               ncomponents = 10,
               min_dist=0.2)
plotUMAP(sceT)

plotUMAP(sceT, colour_by="GZMB")

plotUMAP(sceT, colour_by="GZMH")

plotUMAP(sceT, colour_by="CCR7")

plotUMAP(sceT, colour_by="SELL")

set.seed(464688)
# Build a shared nearest-neighbor graph from PCA space
graph <- buildSNNGraph(sceT, 
                       use.dimred = 'PCA')

# Leiden clustering on the SNN graph
cluster_leidenT <- factor(igraph::cluster_leiden(graph = graph,
                                                 resolution_parameter = 0.001)$membership) 
nlevels(cluster_leidenT)
## [1] 2
colData(sceT)$cluster_leidenT <- cluster_leidenT
plotUMAP(sceT, colour_by="cluster_leidenT")

Annotate on cell state level

cellState <- sce$cellType
cellState[sce$cellType == "B-cell"] <- paste0("B",cluster_leidenB)
cellState[sce$cellType == "T-cell"] <-  paste0("T",cluster_leidenT)
sce$cellState <- cellState
  
plotReducedDim(sce, 
               dimred = "UMAP_fastMNNCorrected",
               point_size=1/2, 
               colour_by="cellState") +
  scale_color_manual(values=dayjob::paletteDiscrete(values=unique(sce$cellState), set="bear"))
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

Differential abundance on cell state level

library(dayjob)
cellCountsLong <- getCellPopulationCounts(sce,
                                        patientVar = "sample",
                                        cellTypeVar = "cellState",
                                        group = "condition",
                                        format="long")
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ✔ purrr   0.3.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse()   masks IRanges::collapse()
## ✖ dplyr::combine()    masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()      masks matrixStats::count()
## ✖ dplyr::desc()       masks IRanges::desc()
## ✖ tidyr::expand()     masks S4Vectors::expand()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::first()      masks S4Vectors::first()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()     masks S4Vectors::rename()
## ✖ dplyr::slice()      masks IRanges::slice()
cellCountsWide <- getCellPopulationCounts(sce,
                                        patientVar = "sample",
                                        cellTypeVar = "cellState",
                                        group = "condition",
                                        format="wide")
countMatrix <- cellCountsWide[,-c(1:2)]
countMatrix <- t(as.matrix(countMatrix))


cellCountsLong <- cellCountsLong %>% 
  group_by(patient, group) %>%
  mutate(totalCells = sum(nCells),
         nCellTypes = n(),
         geoMean = prod(nCells+1)^(1/nCellTypes),
         CLR = log((nCells+1) / geoMean )) %>%
  ungroup() %>%
  mutate(fractionCells = nCells / totalCells)

ggplot(cellCountsLong, aes(x=celltype, y=fractionCells, fill=group)) +
  geom_boxplot(outlier.size = 0) +
  geom_point(position = position_dodge(width = .75)) +
  theme_classic()

ggplot(cellCountsLong, aes(x=celltype, y=CLR, fill=group)) +
  geom_boxplot(outlier.size = 0) +
  geom_point(position = position_dodge(width = .75)) +
  theme_classic()

voomCLR

library(limma)
## 
## Attaching package: 'limma'
## The following object is masked from 'package:scater':
## 
##     plotMDS
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(voomCLR)
design <- model.matrix( ~ group, data=cellCountsWide)
v <- voomCLR(counts = countMatrix,
             design = design)
fit <- lmFit(v, design)
plotBeta(fit)

fit <- applyBiasCorrection(fit)
fit <- eBayes(fit)
tt <- topTable(fit, coef=2, n=Inf)
tt
##                       logFC     AveExpr             t      P.Value    adj.P.Val
## T1             1.771686e+00  2.77490391  5.401473e+00 2.724764e-05 0.0003542193
## T2            -1.184069e+00  2.71762506 -4.046665e+00 6.277716e-04 0.0040805152
## B4            -1.697260e+00 -0.31301613 -2.813365e+00 1.071560e-02 0.0464342628
## B3            -1.651372e+00 -0.02736908 -2.638513e+00 1.573147e-02 0.0511272828
## Megakaryocyte -1.300596e+00 -2.53369553 -2.119965e+00 4.667379e-02 0.1213518537
## B5            -1.371930e+00 -3.33299303 -1.583402e+00 1.289713e-01 0.2395181974
## NK             8.543210e-01  2.70712063  2.016354e+00 5.735339e-02 0.1242656808
## Erythrocyte    6.835507e-01 -0.99397083  9.709046e-01 3.431598e-01 0.4461077694
## B1            -4.524239e-01  0.05428516 -1.141569e+00 2.670800e-01 0.3857821746
## B6            -2.417788e-06 -3.65020694 -2.955746e-06 9.999977e-01 0.9999976709
## CD14 Monocyte  3.353294e-01  2.55956207  1.223873e+00 2.351836e-01 0.3821733661
## B2            -1.698421e-01 -0.53385765 -3.629499e-01 7.204393e-01 0.8514282439
## CD16 Monocyte -4.613844e-02  0.57161236 -1.080049e-01 9.150650e-01 0.9913203775
##                        B
## T1             2.5526362
## T2            -0.5634483
## B4            -2.9246573
## B3            -3.3045385
## Megakaryocyte -3.9842333
## B5            -4.7413960
## NK            -4.7888960
## Erythrocyte   -5.6551917
## B1            -5.6789602
## B6            -5.8227646
## CD14 Monocyte -5.9612331
## B2            -6.1316324
## CD16 Monocyte -6.4134795

NB-GLM

library(glmmTMB)
## Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
## TMB was built with Matrix version 1.4.1
## Current Matrix version is 1.5.1
## Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package
## Warning in checkDepPackageVersion(dep_pkg = "TMB"): Package version inconsistency detected.
## glmmTMB was built with TMB version 1.9.0
## Current TMB version is 1.9.1
## Please re-install glmmTMB from source or restore original 'TMB' package (see '?reinstalling' for more information)
resDfNB <- data.frame(population=rownames(countMatrix),
                      diff=rep(NA,nrow(countMatrix)),
                      se=rep(NA,nrow(countMatrix)),
                      pval=rep(NA,nrow(countMatrix)))
for(pp in 1:nrow(countMatrix)){
  curY <- countMatrix[pp,]
  m_p <- glmmTMB(curY ~ -1 + design, 
                 family=nbinom2(link="log"),
                 offset = log(colSums(countMatrix)))
  resDfNB[pp,2:4] <- c(summary(m_p)$coef$cond["designgroupSC",c(1,2,4)])
}
## Warning in glmmTMB(curY ~ -1 + design, family = nbinom2(link = "log"), offset =
## log(colSums(countMatrix))): use of the 'data' argument is recommended
## Warning in glmmTMB(curY ~ -1 + design, family = nbinom2(link = "log"), offset =
## log(colSums(countMatrix))): use of the 'data' argument is recommended

## Warning in glmmTMB(curY ~ -1 + design, family = nbinom2(link = "log"), offset =
## log(colSums(countMatrix))): use of the 'data' argument is recommended

## Warning in glmmTMB(curY ~ -1 + design, family = nbinom2(link = "log"), offset =
## log(colSums(countMatrix))): use of the 'data' argument is recommended

## Warning in glmmTMB(curY ~ -1 + design, family = nbinom2(link = "log"), offset =
## log(colSums(countMatrix))): use of the 'data' argument is recommended

## Warning in glmmTMB(curY ~ -1 + design, family = nbinom2(link = "log"), offset =
## log(colSums(countMatrix))): use of the 'data' argument is recommended

## Warning in glmmTMB(curY ~ -1 + design, family = nbinom2(link = "log"), offset =
## log(colSums(countMatrix))): use of the 'data' argument is recommended

## Warning in glmmTMB(curY ~ -1 + design, family = nbinom2(link = "log"), offset =
## log(colSums(countMatrix))): use of the 'data' argument is recommended

## Warning in glmmTMB(curY ~ -1 + design, family = nbinom2(link = "log"), offset =
## log(colSums(countMatrix))): use of the 'data' argument is recommended

## Warning in glmmTMB(curY ~ -1 + design, family = nbinom2(link = "log"), offset =
## log(colSums(countMatrix))): use of the 'data' argument is recommended

## Warning in glmmTMB(curY ~ -1 + design, family = nbinom2(link = "log"), offset =
## log(colSums(countMatrix))): use of the 'data' argument is recommended

## Warning in glmmTMB(curY ~ -1 + design, family = nbinom2(link = "log"), offset =
## log(colSums(countMatrix))): use of the 'data' argument is recommended

## Warning in glmmTMB(curY ~ -1 + design, family = nbinom2(link = "log"), offset =
## log(colSums(countMatrix))): use of the 'data' argument is recommended
resDfNB$padj <- p.adjust(resDfNB$pval, "fdr")

resDfNB[order(abs(resDfNB$pval)),]
##       population       diff        se         pval         padj
## 11            T1  1.5047700 0.2432410 6.157942e-10 8.005324e-09
## 12            T2 -1.2351164 0.2993443 3.690096e-05 2.398562e-04
## 4             B4 -2.1082308 0.5336825 7.803806e-05 3.381649e-04
## 5             B5 -4.0378287 1.1432488 4.126013e-04 1.340954e-03
## 3             B3 -1.8446224 0.6062794 2.345995e-03 6.099587e-03
## 9  Megakaryocyte -1.3405902 0.5577411 1.623423e-02 3.517417e-02
## 1             B1 -0.6595095 0.2991692 2.749131e-02 5.105530e-02
## 8    Erythrocyte  1.0008854 0.6247627 1.091501e-01 1.773689e-01
## 13            B6 -1.9497944 1.4289397 1.724089e-01 2.490350e-01
## 10            NK  0.4232895 0.3529722 2.304445e-01 2.995778e-01
## 7  CD16 Monocyte -0.2754860 0.3269730 3.994889e-01 4.199045e-01
## 6  CD14 Monocyte  0.1668012 0.2067743 4.198494e-01 4.199045e-01
## 2             B2 -0.3128874 0.3879153 4.199045e-01 4.199045e-01

edgeR

library(edgeR)
## 
## Attaching package: 'edgeR'
## The following object is masked from 'package:SingleCellExperiment':
## 
##     cpm
d <- DGEList(countMatrix)
d <- calcNormFactors(d)
d <- estimateDisp(d, design)
fit <- glmFit(d, design)
lrt <- glmLRT(fit, coef=2)
lrt$table$padj <- p.adjust(lrt$table$PValue, method="BH")
lrt$table[order(lrt$table$LR, decreasing=TRUE),]
##                     logFC   logCPM           LR       PValue         padj
## T1             3.13365558 18.47514 21.163945959 4.216219e-06 5.481085e-05
## B4            -2.67764773 14.31439 10.577903319 1.144474e-03 7.439080e-03
## B3            -2.62577570 14.76121  9.751696154 1.791573e-03 7.763484e-03
## T2            -1.32223773 17.69649  9.021980269 2.667522e-03 8.669448e-03
## B5            -4.37390459 11.65692  6.942957708 8.414986e-03 2.187896e-02
## Erythrocyte    2.05201491 13.36015  4.010259975 4.522418e-02 9.798572e-02
## CD14 Monocyte  0.71460969 17.29130  3.605029604 5.760504e-02 1.069808e-01
## NK             1.09614669 18.00146  2.335210158 1.264779e-01 2.055266e-01
## Megakaryocyte -1.06441856 10.85509  1.336585335 2.476371e-01 3.576980e-01
## B6            -1.94660919 10.48310  1.181032611 2.771459e-01 3.602896e-01
## B1            -0.43796408 13.74097  0.820729823 3.649669e-01 4.313245e-01
## CD16 Monocyte  0.47294014 14.91270  0.391993239 5.312535e-01 5.755247e-01
## B2             0.05942517 13.03295  0.009459798 9.225187e-01 9.225187e-01

LinDA

library(MicrobiomeStat)
library(phyloseq)
## 
## Attaching package: 'phyloseq'
## The following object is masked from 'package:SummarizedExperiment':
## 
##     distance
## The following object is masked from 'package:Biobase':
## 
##     sampleNames
## The following object is masked from 'package:GenomicRanges':
## 
##     distance
## The following object is masked from 'package:IRanges':
## 
##     distance
lindaRes <- LinDA::linda(otu.tab = countMatrix, # rows features, cols samples
                                    meta = as.data.frame(cellCountsWide),
                                    formula = '~ group',
                                    type = 'count',
                                   adaptive=TRUE,
                                   imputation = FALSE)
## Imputation approach is used.
lindaRes$output$groupSC[order(abs(lindaRes$output$groupSC$stat), decreasing=TRUE),]
##                  baseMean log2FoldChange     lfcSE        stat       pvalue
## T1             88627.7112     2.57099951 0.4292389  5.98967010 0.0001339422
## T2            466082.3323    -1.67532194 0.4151763 -4.03520567 0.0023794988
## Megakaryocyte   2524.9289    -2.00848239 0.7625106 -2.63403872 0.0249883399
## B4             31049.5217    -2.59551271 1.0333052 -2.51185488 0.0308144999
## B3             40381.0392    -2.49607224 1.0816040 -2.30775053 0.0436743400
## NK            143161.3308     1.21775589 0.7168240  1.69882130 0.1201965858
## CD14 Monocyte 163411.1792     0.52587414 0.3293556  1.59667609 0.1414219048
## B1             21301.0870    -0.65483541 0.4163475 -1.57280972 0.1468383840
## B5              1138.7079    -1.81557902 1.3366131 -1.35834295 0.2042072827
## Erythrocyte     3518.6888     1.05254155 1.2157721  0.86573916 0.4069239522
## B2             10078.6410    -0.28523903 0.5414475 -0.52680831 0.6098128715
## CD16 Monocyte  28290.0293    -0.07279074 0.6410964 -0.11354103 0.9118487789
## B6               434.8029     0.05599086 1.0784249  0.05191911 0.9596156384
##                      padj reject df
## T1            0.001741249   TRUE 10
## T2            0.015466742   TRUE 10
## Megakaryocyte 0.100147125  FALSE 10
## B4            0.100147125  FALSE 10
## B3            0.113553284  FALSE 10
## NK            0.238612374  FALSE 10
## CD14 Monocyte 0.238612374  FALSE 10
## B1            0.238612374  FALSE 10
## B5            0.294966075  FALSE 10
## Erythrocyte   0.529001138  FALSE 10
## B2            0.720687939  FALSE 10
## CD16 Monocyte 0.959615638  FALSE 10
## B6            0.959615638  FALSE 10

Summary of results

allPopulations <- rownames(countMatrix)
nPopulations <- nrow(countMatrix)
allMethods <- c("voomCLR", "LinDA", "edgeR", "NBGLM")
nMethods <- length(allMethods)
resDfAll <- data.frame(population = rep(allPopulations, nMethods),
                       method = rep(allMethods, each=nPopulations),
                       log2FC = c(tt[allPopulations,]$logFC,
                                  lindaRes$output$groupSC[allPopulations,]$log2FoldChange,
                                  lrt$table[allPopulations,]$logFC,
                                  log2(exp(resDfNB$diff))),
                       padj = c(tt[allPopulations,]$adj.P.Val,
                                lindaRes$output$groupSC[allPopulations,]$padj,
                                lrt$table[allPopulations,]$padj,
                                resDfNB$padj)
                       
)
resDfAll$DA <- resDfAll$padj <= 0.05

pComp <- ggplot(resDfAll, 
       aes(x=population, y=method, color=log2FC, size=DA)) +
  geom_point()+
  facet_grid(~population, scales = "free_x", space = "free")+
  theme_light()+
  scale_color_gradient2(low= "blue", mid="gray", high= "red") +
  theme(axis.text.x = element_text(angle=45, hjust = 1, vjust = 1),
        strip.text.x = element_text(size=14, angle=90, hjust = 1, vjust = 1))
pComp
## Warning: Using size for a discrete variable is not advised.

resDfAll$DA <- resDfAll$padj <= 0.06 # to show NB models are also at adjusted p-value around .05

pComp <- ggplot(resDfAll, 
       aes(x=population, y=method, color=log2FC, size=DA)) +
  geom_point()+
  facet_grid(~population, scales = "free_x", space = "free")+
  theme_light()+
  scale_color_gradient2(low= "blue", mid="gray", high= "red") +
  theme(axis.text.x = element_text(angle=45, hjust = 1, vjust = 1),
        strip.text.x = element_text(size=14, angle=90, hjust = 1, vjust = 1))
pComp
## Warning: Using size for a discrete variable is not advised.